import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pandas import Series
%matplotlib inline
import gc
import seaborn as sns
import plotly.express as px
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000
pd.set_option('display.max_columns', None, 'max_colwidth', None, 'display.expand_frame_repr', False)
plt.style.use("ggplot")
df = pd.read_csv(r"C:\Users\riyad\Documents\mat005\MAT005 - Coursework data.csv")
df
| Date_Incoming_call | Incoming_Call_Time | DateTime | |
|---|---|---|---|
| 0 | 01/01/2017 | 07:28:00 | 01/01/2017 07:28 |
| 1 | 02/01/2017 | 21:25:00 | 02/01/2017 21:25 |
| 2 | 03/01/2017 | 07:10:00 | 03/01/2017 07:10 |
| 3 | 03/01/2017 | 08:20:00 | 03/01/2017 08:20 |
| 4 | 03/01/2017 | 12:14:00 | 03/01/2017 12:14 |
| ... | ... | ... | ... |
| 3737 | 09/10/2022 | 10:00:00 | 09/10/2022 10:00 |
| 3738 | 16/10/2022 | 07:02:00 | 16/10/2022 07:02 |
| 3739 | 17/10/2022 | 20:00:00 | 17/10/2022 20:00 |
| 3740 | 21/10/2022 | 05:06:00 | 21/10/2022 05:06 |
| 3741 | 21/10/2022 | 12:00:00 | 21/10/2022 12:00 |
3742 rows × 3 columns
df.dtypes
#dtypes is objects, therefore needs to be converted into datetime
Date_Incoming_call object Incoming_Call_Time object DateTime object dtype: object
df["Date_Incoming_call"] = pd.to_datetime(df["Date_Incoming_call"])
C:\Users\riyad\AppData\Local\Temp\ipykernel_6096\2563564867.py:1: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing. df["Date_Incoming_call"] = pd.to_datetime(df["Date_Incoming_call"])
df["DateTime"] = pd.to_datetime(df["DateTime"])
df["DateTime"]
0 2017-01-01 07:28:00
1 2017-02-01 21:25:00
2 2017-03-01 07:10:00
3 2017-03-01 08:20:00
4 2017-03-01 12:14:00
...
3737 2022-09-10 10:00:00
3738 2022-10-16 07:02:00
3739 2022-10-17 20:00:00
3740 2022-10-21 05:06:00
3741 2022-10-21 12:00:00
Name: DateTime, Length: 3742, dtype: datetime64[ns]
#df["Incoming_Call_Time"] = pd.to_datetime(df["Incoming_Call_Time"], format='%H:%M:%S').dt.time
#pd.to_datetime(day1['time'], format='%H:%M').dt.time '%H:%M:%S'
#df.describe(datetime_is_numeric=True)
print(df.isnull().sum())
Date_Incoming_call 0 Incoming_Call_Time 8 DateTime 0 dtype: int64
nulldat = pd.isnull(df["Incoming_Call_Time"])
df[nulldat]
| Date_Incoming_call | Incoming_Call_Time | DateTime | |
|---|---|---|---|
| 454 | 2017-05-30 | NaN | 2017-05-30 |
| 1272 | 2018-08-06 | NaN | 2018-08-06 |
| 2301 | 2019-03-10 | NaN | 2019-03-10 |
| 2304 | 2019-05-10 | NaN | 2019-05-10 |
| 2308 | 2019-08-10 | NaN | 2019-08-10 |
| 2322 | 2019-10-19 | NaN | 2019-10-19 |
| 2335 | 2019-10-28 | NaN | 2019-10-28 |
| 2338 | 2019-10-29 | NaN | 2019-10-29 |
#print(df["Date_Incoming_call"].describe(datetime_is_numeric=True))
#print(df["Incoming_Call_Time"].describe(datetime_is_numeric=True))
df["DateTime"] = df["Date_Incoming_call"] + df["Incoming_Call_Time"]
df[nulldat]
| Date_Incoming_call | Incoming_Call_Time | DateTime | |
|---|---|---|---|
| 454 | 2017-05-30 | NaN | 2017-05-30 |
| 1272 | 2018-08-06 | NaN | 2018-08-06 |
| 2301 | 2019-03-10 | NaN | 2019-03-10 |
| 2304 | 2019-05-10 | NaN | 2019-05-10 |
| 2308 | 2019-08-10 | NaN | 2019-08-10 |
| 2322 | 2019-10-19 | NaN | 2019-10-19 |
| 2335 | 2019-10-28 | NaN | 2019-10-28 |
| 2338 | 2019-10-29 | NaN | 2019-10-29 |
# see if there is any data in the dates where time is missing
print(df.loc[df["Date_Incoming_call"]=="2017-05-30"])
print(df.loc[df["Date_Incoming_call"]=="2018-08-06"])
print(df.loc[df["Date_Incoming_call"]=="2019-03-10"])
print(df.loc[df["Date_Incoming_call"]=="2019-05-10"])
print(df.loc[df["Date_Incoming_call"]=="2019-10-19"])
print(df.loc[df["Date_Incoming_call"]=="2019-10-28"])
print(df.loc[df["Date_Incoming_call"]=="2019-10-29"])
Date_Incoming_call Incoming_Call_Time DateTime
452 2017-05-30 08:00:00 2017-05-30 08:00:00
453 2017-05-30 19:10:00 2017-05-30 19:10:00
454 2017-05-30 NaN 2017-05-30 00:00:00
Date_Incoming_call Incoming_Call_Time DateTime
1269 2018-08-06 06:05:00 2018-08-06 06:05:00
1270 2018-08-06 10:50:00 2018-08-06 10:50:00
1271 2018-08-06 11:55:00 2018-08-06 11:55:00
1272 2018-08-06 NaN 2018-08-06 00:00:00
Date_Incoming_call Incoming_Call_Time DateTime
2299 2019-03-10 06:00:00 2019-03-10 06:00:00
2300 2019-03-10 17:45:00 2019-03-10 17:45:00
2301 2019-03-10 NaN 2019-03-10 00:00:00
Date_Incoming_call Incoming_Call_Time DateTime
2304 2019-05-10 NaN 2019-05-10
Date_Incoming_call Incoming_Call_Time DateTime
2322 2019-10-19 NaN 2019-10-19
Date_Incoming_call Incoming_Call_Time DateTime
2333 2019-10-28 08:50:00 2019-10-28 08:50:00
2334 2019-10-28 09:00:00 2019-10-28 09:00:00
2335 2019-10-28 NaN 2019-10-28 00:00:00
Date_Incoming_call Incoming_Call_Time DateTime
2336 2019-10-29 08:45:00 2019-10-29 08:45:00
2337 2019-10-29 11:30:00 2019-10-29 11:30:00
2338 2019-10-29 NaN 2019-10-29 00:00:00
df =df.dropna().reset_index(drop=True)
sns.displot(df["DateTime"],color="m")
plt.show()
sns.kdeplot(df["DateTime"],color="m" ,shade =True)
plt.title("Number of Calls")
plt.show()
#null values are removed, cleaned data will be exported back to excel to use pivot tables
#df.to_csv('cleaned_data.csv')
clean_data = pd.read_csv(r"C:\Users\riyad\Documents\mat005\sumdat.csv")
clean_data.dtypes
Date_Of_Call object Count of Date_Incoming_call int64 dtype: object
clean_data["Date_Of_Call"] = pd.to_datetime(clean_data["Date_Of_Call"])
C:\Users\riyad\AppData\Local\Temp\ipykernel_6096\1168944757.py:1: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing. clean_data["Date_Of_Call"] = pd.to_datetime(clean_data["Date_Of_Call"])
clean_data.describe()
| Count of Date_Incoming_call | |
|---|---|
| count | 1584.000000 |
| mean | 2.357323 |
| std | 1.364137 |
| min | 1.000000 |
| 25% | 1.000000 |
| 50% | 2.000000 |
| 75% | 3.000000 |
| max | 11.000000 |
clean_data.set_index(["Date_Of_Call"])
| Count of Date_Incoming_call | |
|---|---|
| Date_Of_Call | |
| 2017-01-01 | 1 |
| 2017-01-02 | 1 |
| 2017-01-03 | 5 |
| 2017-01-04 | 3 |
| 2017-01-05 | 7 |
| ... | ... |
| 2022-10-07 | 1 |
| 2022-10-09 | 1 |
| 2022-10-16 | 1 |
| 2022-10-17 | 1 |
| 2022-10-21 | 2 |
1584 rows × 1 columns
x = clean_data["Date_Of_Call"]
y = clean_data["Count of Date_Incoming_call"]
plt.figure(figsize=(35,10))
plt.plot(x,y)
#just a general plot to see data, it is congested so we will look at monthly and yearly scale
[<matplotlib.lines.Line2D at 0x1f086413040>]
plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('M').mean()["Count of Date_Incoming_call"].plot()
plt.ylabel("Mean Number of Calls")
plt.xlabel("Date")
plt.title("Average Monthly Calls")
plt.show()
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('Y').mean()["Count of Date_Incoming_call"].plot()
plt.ylabel("Mean Number of Calls")
plt.xlabel("Date")
plt.title("Average Yearly Calls")
plt.show()
plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('W').mean()["Count of Date_Incoming_call"].plot()
plt.ylabel("Mean Number of Calls")
plt.xlabel("Date")
plt.title("Average Weekly Calls")
plt.show()
plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('M').std()["Count of Date_Incoming_call"].plot()
plt.ylabel("Standard Deviation of Calls")
plt.xlabel("Date")
plt.title("Standard Deviation of Monthly Calls")
plt.show()
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('Y').std()["Count of Date_Incoming_call"].plot()
plt.ylabel("Standard Deviation of Calls")
plt.xlabel("Date")
plt.title("Standard Deviation of Yearly Calls")
plt.show()
plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('W').std()["Count of Date_Incoming_call"].plot()
plt.ylabel("Standard Deviation of Calls")
plt.xlabel("Date")
plt.title("Standard Deviation of Weekly Calls")
plt.show()
clean_data["Week"] = clean_data["Date_Of_Call"].dt.isocalendar().week
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('M').sum()["Count of Date_Incoming_call"].plot()
plt.ylabel("Sum Number of Calls")
plt.xlabel("Date")
plt.title("Monthly Calls")
plt.figure(figsize=(35,8))
plt.show()
<Figure size 3500x800 with 0 Axes>
Attached below:
Monthly Trend of 2017 Calls
Monthly Trend of 2018 Calls
Monthly Trend of 2019 Calls
Monthly Trend of 2020 Calls
Monthly Trend of 2021 Calls
Monthly Trend of 2022 Calls
Quarterly Range
#convert data to monthly scale
clean_data.set_index("Date_Of_Call", inplace=True)
monthly_data = clean_data.resample('M').mean()["Count of Date_Incoming_call"]
monthly_data.plot()
<AxesSubplot:xlabel='Date_Of_Call'>
m3 = pd.read_excel(r"C:\Users\riyad\Documents\mat005\cleaned_data.xlsx", sheet_name="new_month_data")
m3.set_index("Month")
| Sum_Calls | |
|---|---|
| Month | |
| 2017-01-01 | 91 |
| 2017-02-01 | 93 |
| 2017-03-01 | 87 |
| 2017-04-01 | 76 |
| 2017-05-01 | 84 |
| ... | ... |
| 2022-06-01 | 32 |
| 2022-07-01 | 38 |
| 2022-08-01 | 69 |
| 2022-09-01 | 42 |
| 2022-10-01 | 15 |
70 rows × 1 columns
m3["roll_mean"] = m3["Sum_Calls"].rolling(window=12).mean()
m3["roll_std"] = m3["Sum_Calls"].rolling(window=12).std()
m3.head()
| Month | Sum_Calls | roll_mean | roll_std | |
|---|---|---|---|---|
| 0 | 2017-01-01 | 91 | NaN | NaN |
| 1 | 2017-02-01 | 93 | NaN | NaN |
| 2 | 2017-03-01 | 87 | NaN | NaN |
| 3 | 2017-04-01 | 76 | NaN | NaN |
| 4 | 2017-05-01 | 84 | NaN | NaN |
x = m3["Month"]
y = m3["Sum_Calls"]
plt.ylabel("Sum of Calls per Month")
plt.xlabel("Date")
normal =plt.plot(x,y,"c",label ="Data")
import statsmodels.api as sm
plt.figure(figsize=(20,5))
#Normal Monthly Data
sns.lineplot( x = "Month",y = "Sum_Calls", data = m3, label = 'Volume of Calls')
plt.xlabel = "Data Range"
plt.ylabel = "Volume of Calls"
#Rolling Mean
sns.lineplot( x = "Month", y = "roll_mean", data = m3, label = 'Rolling Mean')
#Rolling Std
sns.lineplot( x = "Month", y = "roll_std", data = m3, label = 'Rolling Standard Deviation')
plt.xlabel = "Data Range"
plt.ylabel = "Volume of Calls"
plt.show()
#data is non-stationary as evident above
#apply dickey-fuller test for stationary
from statsmodels.tsa.stattools import adfuller
dftest = adfuller(m3["Sum_Calls"],autolag ="AIC")
dfresult = pd.Series(dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in dftest[4].items():
dfresult["Critical Value (%s)"%key] = value
print(dfresult)
ADF Statistic: -2.423050 p-value 0.135336 Number of Lags 1.000000 Number of Observations Used 68.000000 Critical Value (1%) -3.530399 Critical Value (5%) -2.905087 Critical Value (10%) -2.590001 dtype: float64
we can see that from the test, the p value is greater than 0.05, therefore the data is non-stationary
drop_data = m3.drop(columns= ["roll_mean","roll_std"]).copy()
drop_data["Log_Values"]=np.log(drop_data["Sum_Calls"])
drop_data.dtypes
Month datetime64[ns] Sum_Calls int64 Log_Values float64 dtype: object
drop_data = drop_data.set_index("Month") drop_data["Sum_Calls"] =drop_data["Sum_Calls"].resample('W').sum() drop_data
#drop_data["Log_Values"]=np.log(drop_data["Sum_Calls"])
df3 =clean_data.copy()
df3
| Count of Date_Incoming_call | |
|---|---|
| Date_Of_Call | |
| 2017-01-01 | 1 |
| 2017-01-02 | 1 |
| 2017-01-03 | 5 |
| 2017-01-04 | 3 |
| 2017-01-05 | 7 |
| ... | ... |
| 2022-10-07 | 1 |
| 2022-10-09 | 1 |
| 2022-10-16 | 1 |
| 2022-10-17 | 1 |
| 2022-10-21 | 2 |
1584 rows × 1 columns
df3.dtypes
Count of Date_Incoming_call int64 dtype: object
df3 =df3.resample("W").sum()
df3
| Count of Date_Incoming_call | |
|---|---|
| Date_Of_Call | |
| 2017-01-01 | 1 |
| 2017-01-08 | 25 |
| 2017-01-15 | 22 |
| 2017-01-22 | 22 |
| 2017-01-29 | 21 |
| ... | ... |
| 2022-09-25 | 15 |
| 2022-10-02 | 7 |
| 2022-10-09 | 3 |
| 2022-10-16 | 1 |
| 2022-10-23 | 3 |
304 rows × 1 columns
drop_data = m3.drop(columns= ["roll_mean","roll_std"]).copy()
#drop_data = drop_data.set_index("Month")
drop_data["Log_Values"]=np.log(drop_data["Sum_Calls"])
log_data= drop_data[["Month","Log_Values"]].copy()
log_data=log_data.set_index("Month")
plt.plot(log_data)
[<matplotlib.lines.Line2D at 0x1f08a2fa4c0>]
logmean = log_data.rolling(window=12).mean()
logstd = log_data.rolling(window=12).std()
plt.plot(log_data,label="Log Value")
plt.plot(logmean, label="Log Rolling Mean")
plt.plot(logstd,label="Log Rolling Std")
plt.legend()
<matplotlib.legend.Legend at 0x1f09056af10>
plt.figure(figsize=(16,5))
ma_diff = log_data - logmean
ma_diff.dropna(inplace=True)
madiffmean= ma_diff.rolling(window=12).mean()
madiffstd = ma_diff.rolling(window=12).std()
plt.plot(ma_diff, label="Difference")
plt.plot(madiffmean, label="Rolling Mean")
plt.plot(madiffstd, label="Rolling STD")
plt.legend()
<matplotlib.legend.Legend at 0x1f08a2bac70>
#ADF Test for log transformation
log_dftest = adfuller(ma_diff,autolag ="AIC")
log_dfresult = pd.Series(log_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in log_dftest[4].items():
log_dfresult["Critical Value (%s)"%key] = value
print(log_dfresult)
ADF Statistic: -3.286675 p-value 0.015485 Number of Lags 2.000000 Number of Observations Used 56.000000 Critical Value (1%) -3.552928 Critical Value (5%) -2.914731 Critical Value (10%) -2.595137 dtype: float64
log_data
| Log_Values | |
|---|---|
| Month | |
| 2017-01-01 | 4.510860 |
| 2017-02-01 | 4.532599 |
| 2017-03-01 | 4.465908 |
| 2017-04-01 | 4.330733 |
| 2017-05-01 | 4.430817 |
| ... | ... |
| 2022-06-01 | 3.465736 |
| 2022-07-01 | 3.637586 |
| 2022-08-01 | 4.234107 |
| 2022-09-01 | 3.737670 |
| 2022-10-01 | 2.708050 |
70 rows × 1 columns
Based off the excel graphs, the overall trend per year does not show that much seasonality, except for negative trends between Jan-Feb for 2018-2020 & 2022. We can use a decomposition plot to determine seasonality
period = len(m3)/2
period
35.0
## Decomposition Plot
decomposition = sm.tsa.seasonal_decompose(drop_data["Sum_Calls"], model="additive",extrapolate_trend='freq',period =35)
decom_plot =decomposition.plot()
plt.figure(figsize=(30,6))
#we use period of 35 as we have 70 observations, dataframe is unnested, hence we use length df / 2
#https://stackoverflow.com/questions/60017052/decompose-for-time-series-valueerror-you-must-specify-a-period-or-x-must-be
<Figure size 3000x600 with 0 Axes>
<Figure size 3000x600 with 0 Axes>
In order to see the trend in the log time series, we need to take the exponential weighted average of the rolling mean
exp_avg = log_data.ewm(halflife=12,min_periods=0,ignore_na=False).mean()
plt.plot(log_data, label="Log Transformed Data")
plt.plot(exp_avg, label ="Exponential Weighted Avg")
plt.legend()
plt.title("Exp Trend")
Text(0.5, 1.0, 'Exp Trend')
exp_diff = log_data - exp_avg
exp_diff.dropna(inplace=True)
expdiffmean= exp_diff.rolling(window=12).mean()
expdiffstd = exp_diff.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(exp_diff, label="De-Trended Plot")
plt.plot(expdiffmean, label="Rolling Mean")
plt.plot(expdiffstd, label="Rolling STD")
plt.legend()
plt.title("De-Trend Using Exponential Weighted Average")
exp_dftest = adfuller(exp_diff,autolag ="AIC")
exp_dfresult = pd.Series(exp_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in exp_dftest[4].items():
exp_dfresult["Critical Value (%s)"%key] = value
print(exp_dfresult)
ADF Statistic: -4.722287 p-value 0.000076 Number of Lags 0.000000 Number of Observations Used 69.000000 Critical Value (1%) -3.528890 Critical Value (5%) -2.904440 Critical Value (10%) -2.589656 dtype: float64
here we have removed trend from the log transformation of the time series
ts_diff = log_data-log_data.shift(12)
plt.plot(ts_diff)
[<matplotlib.lines.Line2D at 0x1f08a2874c0>]
here the data has been shifted by 1 lag
ts_diff.dropna(inplace=True)
ts_diffmean= ts_diff.rolling(window=12).mean()
ts_diffstd = ts_diff.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(ts_diff, label="Shifted Plot")
plt.plot(ts_diffmean, label="Rolling Mean")
plt.plot(ts_diffstd, label="Rolling STD")
plt.legend()
plt.title("Log Difference Shift")
ts_dftest = adfuller(ts_diff,autolag ="AIC")
ts_dfresult = pd.Series(ts_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in ts_dftest[4].items():
ts_dfresult["Critical Value (%s)"%key] = value
print(ts_dfresult)
ADF Statistic: -3.171402 p-value 0.021701 Number of Lags 0.000000 Number of Observations Used 57.000000 Critical Value (1%) -3.550670 Critical Value (5%) -2.913766 Critical Value (10%) -2.594624 dtype: float64
y = drop_data["Sum_Calls"]
y_diff = y-y.shift(12)
plt.plot(y_diff)
[<matplotlib.lines.Line2D at 0x1f08bb41310>]
y_diff.dropna(inplace=True)
y_diffmean= y_diff.rolling(window=12).mean()
y_diffstd = y_diff.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(y_diff, label="Shifted Plot")
plt.plot(y_diffmean, label="Rolling Mean")
plt.plot(y_diffstd, label="Rolling STD")
plt.legend()
plt.title("Shifting Via Original Data")
y_dftest = adfuller(y_diff,autolag ="AIC")
y_dfresult = pd.Series(y_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in y_dftest[4].items():
y_dfresult["Critical Value (%s)"%key] = value
print(y_dfresult)
ADF Statistic: -3.393997 p-value 0.011165 Number of Lags 0.000000 Number of Observations Used 57.000000 Critical Value (1%) -3.550670 Critical Value (5%) -2.913766 Critical Value (10%) -2.594624 dtype: float64
y_trend = (y - y.rolling(window=12).mean())/y.rolling(window=12).std()
plt.plot(y_trend)
[<matplotlib.lines.Line2D at 0x1f08bba7ee0>]
y_trend.dropna(inplace=True)
y_trendmean= y_trend.rolling(window=12).mean()
y_trendstd =y_trend.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(y_trend, label="De-Trended Plot")
plt.plot(y_trendmean, label="Rolling Mean")
plt.plot(y_trendstd, label="Rolling STD")
plt.legend()
plt.title("De-Trending with Original Data")
y_trend_dftest = adfuller(y_trend,autolag ="AIC")
y_trend_dfresult = pd.Series(y_trend_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in y_trend_dftest[4].items():
y_trend_dfresult["Critical Value (%s)"%key] = value
print(y_trend_dfresult)
ADF Statistic: -3.299364 p-value 0.014907 Number of Lags 2.000000 Number of Observations Used 56.000000 Critical Value (1%) -3.552928 Critical Value (5%) -2.914731 Critical Value (10%) -2.595137 dtype: float64
log_decomp = sm.tsa.seasonal_decompose(log_data, model="multiplicative",extrapolate_trend='freq',period =35)
log_decomp_plot =log_decomp.plot()
ts_log_decomp = log_decomp.resid
ts_log_decomp.dropna(inplace=True)
ts_log_decompmean= ts_log_decomp.rolling(window=12).mean()
ts_log_decompstd = ts_log_decomp.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.title("Test For Noise Using Residuals")
plt.plot(ts_log_decomp, label="Difference")
plt.plot(ts_log_decompmean, label="Rolling Mean")
plt.plot(ts_log_decompstd, label="Rolling STD")
plt.legend()
ts_log_decomp_dftest = adfuller(ts_log_decomp,autolag ="AIC")
ts_log_decomp_dfresult = pd.Series(ts_log_decomp_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in ts_log_decomp_dftest[4].items():
ts_log_decomp_dfresult["Critical Value (%s)"%key] = value
print(ts_log_decomp_dfresult)
ADF Statistic: -3.184199 p-value 0.020918 Number of Lags 1.000000 Number of Observations Used 68.000000 Critical Value (1%) -3.530399 Critical Value (5%) -2.905087 Critical Value (10%) -2.590001 dtype: float64
Data is not stationary as the test statistic is greater than the 1% critical value, as we use the residuals to measure irregularities in the data
ACF & PACF will be implemented to further test seasonality
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(ts_diff)
plot_pacf(ts_diff)
C:\Users\riyad\anaconda3\lib\site-packages\statsmodels\graphics\tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
warnings.warn(
df3 =clean_data.copy()
df3
| Count of Date_Incoming_call | |
|---|---|
| Date_Of_Call | |
| 2017-01-01 | 1 |
| 2017-01-02 | 1 |
| 2017-01-03 | 5 |
| 2017-01-04 | 3 |
| 2017-01-05 | 7 |
| ... | ... |
| 2022-10-07 | 1 |
| 2022-10-09 | 1 |
| 2022-10-16 | 1 |
| 2022-10-17 | 1 |
| 2022-10-21 | 2 |
1584 rows × 1 columns
df3.dtypes
Count of Date_Incoming_call int64 dtype: object
The test data should range for 8 weeks, in order to predict the next 8 weeks
#for test:
df3 =df3.resample("W").sum()
df3.tail(9)
| Count of Date_Incoming_call | |
|---|---|
| Date_Of_Call | |
| 2022-08-28 | 18 |
| 2022-09-04 | 21 |
| 2022-09-11 | 29 |
| 2022-09-18 | 19 |
| 2022-09-25 | 15 |
| 2022-10-02 | 7 |
| 2022-10-09 | 3 |
| 2022-10-16 | 1 |
| 2022-10-23 | 3 |
valid = df3.loc["2022-09-04":"2022-10-23"]
train = df3.loc["2017-01-01":"2022-09-04"]
plt.figure(figsize=(20,8))
plt.plot(train,"b",label="Train")
plt.plot(valid, "y",label="Validation")
#plt.xlabel("Datetime")
#plt.ylabel("Number of Calls")
plt.legend(loc='best')
plt.title("Test&Train Split")
plt.show()
model_df = valid.copy()
model_df["Naive_Forcast"] = train["Count of Date_Incoming_call"][len(train)-1]
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["Naive_Forcast"],"g", label="Naive forecast")
plt.legend(loc="best")
plt.title("Naive Approach")
plt.show()
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
naive_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["Naive_Forcast"]))
naive_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["Naive_Forcast"])
print("RMSE:",naive_rmse)
print("MAPE:",naive_mape)
RMSE: 12.98075498574717 MAPE: 10.75
Here we will use 8 as the value of rolling window as we are forecasting 8 weeks
model_df = valid.copy()
model_df["Moving_Average"] = train["Count of Date_Incoming_call"].rolling(window=8).mean().iloc[-1]
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["Moving_Average"],"g", label="Moving Average")
plt.legend(loc="best")
plt.title("Moving Average Forecast")
plt.show()
ma_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["Moving_Average"]))
ma_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["Moving_Average"])
print("RMSE:",ma_rmse)
print("MAPE:",ma_mape)
RMSE: 9.878164050065173 MAPE: 8.75
#as we experience trend in our series, SES would not be a suitable model to implement. however we will test this
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
# we need to define the value of alpha, we will set the alpha as 1/(2*m) where m = time period, our time period will be 8 weeks
m =8
alpha = 1/(2*8)
model_df =valid.copy()
ses_model = SimpleExpSmoothing(train["Count of Date_Incoming_call"]).fit(smoothing_level=alpha,optimized=False)
model_df["SES"] = ses_model.forecast(len(valid))
ses_model.summary()
| Dep. Variable: | Count of Date_Incoming_call | No. Observations: | 297 |
|---|---|---|---|
| Model: | SimpleExpSmoothing | SSE | 13102.481 |
| Optimized: | False | AIC | 1128.687 |
| Trend: | None | BIC | 1136.074 |
| Seasonal: | None | AICC | 1128.824 |
| Seasonal Periods: | None | Date: | Fri, 05 May 2023 |
| Box-Cox: | False | Time: | 12:33:26 |
| Box-Cox Coeff.: | None |
| coeff | code | optimized | |
|---|---|---|---|
| smoothing_level | 0.0625000 | alpha | False |
| initial_level | 1.0000000 | l.0 | False |
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["SES"],"g", label="Simple Exponential Smoothing")
plt.legend(loc="best")
plt.title("SES Forecast")
plt.show()
model_df["SES"]
model_df["SES"]= model_df["SES"].fillna(8.161579)
print(model_df["SES"])
Date_Of_Call 2022-09-04 8.161579 2022-09-11 8.161579 2022-09-18 8.161579 2022-09-25 8.161579 2022-10-02 8.161579 2022-10-09 8.161579 2022-10-16 8.161579 2022-10-23 8.161579 Freq: W-SUN, Name: SES, dtype: float64
ses_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["SES"]))
ses_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["SES"])
print("RMSE:",ses_rmse)
print("MAPE:",ses_mape)
RMSE: 10.423659895611774 MAPE: 8.750000016454212
from statsmodels.tsa.holtwinters import ExponentialSmoothing
the model would not work as some of the weekly values equalled to zero in the train data, as there were no data for days in those weeks
train["Count of Date_Incoming_call"].loc[(train["Count of Date_Incoming_call"])==0]
Date_Of_Call 2019-02-10 0 2019-12-08 0 2019-12-15 0 2019-12-22 0 2019-12-29 0 2020-12-20 0 2020-12-27 0 2022-08-07 0 Name: Count of Date_Incoming_call, dtype: int64
date_list = [datetime(2019, 2, 10),
datetime(2019, 12, 8),
datetime(2019, 12, 15),
datetime(2019, 12, 22),
datetime(2019, 12, 29),
datetime(2020, 12, 20),
datetime(2020, 12, 27),
datetime(2022, 8, 7),
]
train_2 = train.drop(date_list)
train_2.dtypes
Count of Date_Incoming_call int64 dtype: object
train_2
| Count of Date_Incoming_call | |
|---|---|
| Date_Of_Call | |
| 2017-01-01 | 1 |
| 2017-01-08 | 25 |
| 2017-01-15 | 22 |
| 2017-01-22 | 22 |
| 2017-01-29 | 21 |
| ... | ... |
| 2022-07-31 | 7 |
| 2022-08-14 | 7 |
| 2022-08-21 | 17 |
| 2022-08-28 | 18 |
| 2022-09-04 | 21 |
289 rows × 1 columns
train_2.index=pd.DatetimeIndex(train_2.index).to_period('W')
hwse_model = ExponentialSmoothing(train_2,trend ="add",seasonal ="mul",seasonal_periods=36).fit()
#from the decomposition plot, we have 2 seasonal periods, therefore we will put this into the paramaters of the model
model_df["HWSE"] = hwse_model.forecast(len(valid))
model_df["HWSE"]
Date_Of_Call 2022-09-04 NaN 2022-09-11 NaN 2022-09-18 NaN 2022-09-25 NaN 2022-10-02 NaN 2022-10-09 NaN 2022-10-16 NaN 2022-10-23 NaN Freq: W-SUN, Name: HWSE, dtype: float64
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["HWSE"],"g", label="Holt Winter Method")
plt.legend(loc="best")
plt.title("Holt Winter Forecast")
plt.show()
hwse_model.summary()
| Dep. Variable: | Count of Date_Incoming_call | No. Observations: | 289 |
|---|---|---|---|
| Model: | ExponentialSmoothing | SSE | 5303.144 |
| Optimized: | True | AIC | 920.883 |
| Trend: | Additive | BIC | 1067.540 |
| Seasonal: | Multiplicative | AICC | 935.566 |
| Seasonal Periods: | 36 | Date: | Fri, 05 May 2023 |
| Box-Cox: | False | Time: | 12:33:27 |
| Box-Cox Coeff.: | None |
| coeff | code | optimized | |
|---|---|---|---|
| smoothing_level | 0.4495968 | alpha | True |
| smoothing_trend | 2.9807e-10 | beta | True |
| smoothing_seasonal | 3.4581e-10 | gamma | True |
| initial_level | 10.847656 | l.0 | True |
| initial_trend | 0.0213134 | b.0 | True |
| initial_seasons.0 | 1.1030563 | s.0 | True |
| initial_seasons.1 | 1.0197384 | s.1 | True |
| initial_seasons.2 | 1.2875107 | s.2 | True |
| initial_seasons.3 | 1.3537338 | s.3 | True |
| initial_seasons.4 | 1.1128906 | s.4 | True |
| initial_seasons.5 | 1.3129743 | s.5 | True |
| initial_seasons.6 | 1.5035589 | s.6 | True |
| initial_seasons.7 | 1.2515672 | s.7 | True |
| initial_seasons.8 | 1.4424032 | s.8 | True |
| initial_seasons.9 | 1.4961853 | s.9 | True |
| initial_seasons.10 | 1.2179177 | s.10 | True |
| initial_seasons.11 | 1.4119162 | s.11 | True |
| initial_seasons.12 | 1.3938413 | s.12 | True |
| initial_seasons.13 | 1.4703230 | s.13 | True |
| initial_seasons.14 | 1.4563917 | s.14 | True |
| initial_seasons.15 | 1.3901391 | s.15 | True |
| initial_seasons.16 | 1.4909692 | s.16 | True |
| initial_seasons.17 | 1.2790844 | s.17 | True |
| initial_seasons.18 | 1.5066231 | s.18 | True |
| initial_seasons.19 | 1.3552973 | s.19 | True |
| initial_seasons.20 | 1.1947756 | s.20 | True |
| initial_seasons.21 | 0.9346972 | s.21 | True |
| initial_seasons.22 | 0.9289004 | s.22 | True |
| initial_seasons.23 | 1.2427760 | s.23 | True |
| initial_seasons.24 | 0.9234554 | s.24 | True |
| initial_seasons.25 | 0.9857903 | s.25 | True |
| initial_seasons.26 | 1.0587609 | s.26 | True |
| initial_seasons.27 | 0.9610390 | s.27 | True |
| initial_seasons.28 | 0.8203296 | s.28 | True |
| initial_seasons.29 | 0.7158517 | s.29 | True |
| initial_seasons.30 | 0.8293941 | s.30 | True |
| initial_seasons.31 | 0.7263256 | s.31 | True |
| initial_seasons.32 | 0.8367579 | s.32 | True |
| initial_seasons.33 | 0.7320130 | s.33 | True |
| initial_seasons.34 | 1.0538373 | s.34 | True |
| initial_seasons.35 | 1.0765308 | s.35 | True |
from statsmodels.tsa.arima.model import ARIMA
arima_model = ARIMA(train["Count of Date_Incoming_call"],order=(1,0,2)).fit()
model_df["ARIMA"] = arima_model.forecast(8)
model_df["ARIMA"]
model_df["ARIMA"]= model_df["ARIMA"].fillna(18.3)
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["ARIMA"],"g", label="ARIMA Model")
plt.legend(loc="best")
plt.title("ARIMA Forecast")
plt.show()
ARIMA_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["ARIMA"]))
ARIMA_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["ARIMA"])
print("RMSE:",ARIMA_rmse)
print("MAPE:",ARIMA_mape)
RMSE: 9.305601100537038 MAPE: 7.97205743285917
model_df["ARIMA"]
Date_Of_Call 2022-09-04 18.300000 2022-09-11 17.009294 2022-09-18 15.672935 2022-09-25 15.404600 2022-10-02 15.158561 2022-10-09 14.932965 2022-10-16 14.726113 2022-10-23 14.536448 Freq: W-SUN, Name: ARIMA, dtype: float64
result =model_df["ARIMA"]
result
Date_Of_Call 2022-09-04 18.300000 2022-09-11 17.009294 2022-09-18 15.672935 2022-09-25 15.404600 2022-10-02 15.158561 2022-10-09 14.932965 2022-10-16 14.726113 2022-10-23 14.536448 Freq: W-SUN, Name: ARIMA, dtype: float64
from statsmodels.graphics.tsaplots import plot_predict plot_predict(result, start=2022-10-24, end=2022-12-18)
new_log = log_data.copy()
new_log
| Log_Values | |
|---|---|
| Month | |
| 2017-01-01 | 4.510860 |
| 2017-02-01 | 4.532599 |
| 2017-03-01 | 4.465908 |
| 2017-04-01 | 4.330733 |
| 2017-05-01 | 4.430817 |
| ... | ... |
| 2022-06-01 | 3.465736 |
| 2022-07-01 | 3.637586 |
| 2022-08-01 | 4.234107 |
| 2022-09-01 | 3.737670 |
| 2022-10-01 | 2.708050 |
70 rows × 1 columns
new_log.dtypes
new_log.index = pd.DatetimeIndex(new_log.index.values,
freq=new_log.index.inferred_freq)
log_arima_model = ARIMA(new_log,order=(1,0,2))
logAresult = log_arima_model.fit()
plt.figure(figsize=(20,8))
plt.plot(log_data,"b" ,label="Log Original Data")
plt.plot(logAresult.fittedvalues,"g", label="Log ARIMA Model")
plt.legend(loc="best")
plt.title("Log ARIMA Forecast")
plt.show()
LogARIMA_rmse = sqrt(mean_squared_error(log_data["Log_Values"],logAresult.fittedvalues)) LogARIMA_mape = mean_absolute_error(log_data["Log_Values"],logAresult.fittedvalues) print("RMSE:",LogARIMA_rmse) print("MAPE:",LogARIMA_mape)
arima_model2 = ARIMA(df3["Count of Date_Incoming_call"],order=(1,0,2)).fit()
res3 =arima_model2.fittedvalues
res3
Date_Of_Call
2017-01-01 12.040111
2017-01-08 4.132902
2017-01-15 18.380619
2017-01-22 17.321799
2017-01-29 19.069516
...
2022-09-25 18.000037
2022-10-02 16.130886
2022-10-09 10.779558
2022-10-16 7.546969
2022-10-23 5.188549
Freq: W-SUN, Length: 304, dtype: float64
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(res3,"g", label="ARIMA Model")
plt.legend(loc="best")
plt.title("ARIMA Forecast")
plt.show()
from statsmodels.graphics.tsaplots import plot_predict
fig, (ax) = plt.subplots()
ax= df3["Count of Date_Incoming_call"].plot(ax=ax)
plot_predict(arima_model2, "2022-10-24", "2022-12-18", ax=ax)
plt.title("Arima Forecast")
Text(0.5, 1.0, 'Arima Forecast')
week1 =arima_model2.predict("2022-10-24","2022-10-30")
print("Prediction for Week 1:",week1)
week2= arima_model2.predict("2022-10-31", "2022-11-06")
print("Prediction for Week 2:",week2)
week3= arima_model2.predict("2022-11-07", "2022-11-13")
print("Prediction for Week 3:",week3)
week4= arima_model2.predict("2022-11-14", "2022-11-20")
print("Prediction for Week 4:",week4)
week5= arima_model2.predict("2022-11-21", "2022-11-27")
print("Prediction for Week 5:",week5)
week6= arima_model2.predict("2022-11-28", "2022-12-04")
print("Prediction for Week 6:",week6)
week7= arima_model2.predict("2022-12-05", "2022-12-11")
print("Prediction for Week 7:",week7)
week8= arima_model2.predict("2022-12-12", "2022-12-18")
print("Prediction for Week 8:",week8)
Prediction for Week 1: 2022-10-30 5.451387 Freq: W-SUN, dtype: float64 Prediction for Week 2: 2022-11-06 6.403464 Freq: W-SUN, dtype: float64 Prediction for Week 3: 2022-11-13 6.977868 Freq: W-SUN, dtype: float64 Prediction for Week 4: 2022-11-20 7.493737 Freq: W-SUN, dtype: float64 Prediction for Week 5: 2022-11-27 7.957037 Freq: W-SUN, dtype: float64 Prediction for Week 6: 2022-12-04 8.373124 Freq: W-SUN, dtype: float64 Prediction for Week 7: 2022-12-11 8.746809 Freq: W-SUN, dtype: float64 Prediction for Week 8: 2022-12-18 9.082414 Freq: W-SUN, dtype: float64
2022-10-30 5.451387 Freq: W-SUN, dtype: float64